# Processing functions for the snowball crawl: HTML cleaning, tokenizing,
# stopword removal, stemming, and linear classification of downloaded pages.

import csv, re, time, math
from lxml.html import clean
import porter
import snowcrawl

# Connection settings for the crawl controller: host, port, password.
ip = '127.0.0.1'
port = 8081
pw = 'my_pw'

def fiveWaves(self):
	"""Termination predicate: True once the crawl is past its fifth wave."""
	return 5 < self.current_wave

class processing_params( object ):
	"""Load-once shared resources for classifyText.

	All attributes are built at class-definition time (one load per
	process): the stopword list, the token->weight linear model, a
	Porter stemmer, an lxml HTML cleaner, and a clock function.
	"""
	# One stopword per line.  splitlines() drops the trailing newline and,
	# unlike the old read()[:-1].split('\n'), does not chop the last
	# character of the final word when the file lacks a trailing newline.
	with open( 'test3_stopwords.txt', 'r' ) as _f:
		stopword_list = _f.read().splitlines()

	# Linear model: token -> float weight.  First CSV row is a header.
	linear_classifier = {}
	with open( 'test3_classifier.csv' ) as _f:
		csv_reader = csv.reader( _f )
		next(csv_reader)  # skip header; works on Py2.6+ and Py3 (vs .next())
		for token, weight in csv_reader:
			linear_classifier[token] = float(weight)

	porterStemmer = porter.PorterStemmer()

	# Strip <style>, <script>, comments and unsafe attributes from HTML.
	cleaner = clean.Cleaner( style=True, scripts=True, comments=True, safe_attrs_only=True )
	my_time = time.time

def classifyText( text, params ):
	"""Clean, tokenize, stem and score *text* with a linear classifier.

	params supplies: cleaner, stopword_list, porterStemmer,
	linear_classifier (token -> weight), and my_time (clock function).

	Returns (kept, stats) where kept is True when the score z < 0 and
	stats is [start_time, end_time, n_stems, z, sigmoid(z), int(z>0)].
	"""
	start_time = params.my_time()

	# Best-effort HTML cleanup -- lxml can choke on malformed fragments,
	# in which case the regex strip below is the fallback.  Narrowed from
	# a bare except so KeyboardInterrupt/SystemExit still propagate.
	try:
		text = params.cleaner.clean_html( text )
	except Exception:
		pass

	# Drop any remaining tags, collapse whitespace, normalise case.
	text = re.sub(r'<.*?>', ' ', text )
	text = re.sub(r'\s+', ' ', text )
	text = text.lower()

	# Tokenize: alphabetic runs only.
	tokens = re.findall(r'[a-z]+', text )

	# Remove stop words.  Hoisted into a set: O(1) membership instead of
	# an O(n) list scan per token.
	stopwords = set( params.stopword_list )
	tokens_2 = [ t for t in tokens if t not in stopwords ]

	# Stem each surviving token.
	stems = [ params.porterStemmer.stem( t, 0, len(t)-1 ) for t in tokens_2 ]

	# Linear score: sum of weights of known stems (no intercept term).
	z = 0
	for s in stems:
		if s in params.linear_classifier:
			z += params.linear_classifier[s]

	end_time = params.my_time()
	# NOTE(review): "kept" is z<0 while the last stat records int(z>0) --
	# looks intentional (keep = negative score) but worth confirming.
	return ( z<0, [start_time, end_time, len(stems), z, 1/(1+math.exp(-z)), int(z>0)] )


def downloadAndClassifyUrl( url_and_params ):
	"""Download one URL, classify it, and extract its outgoing edges.

	Takes a single (url, params) tuple so the call convention is
	unchanged for map()/pool workers -- but the Python-2-only tuple
	parameter unpacking (removed by PEP 3113) is replaced with an
	explicit unpack, so the definition is valid on Python 2 and 3.

	Returns (kept_flag, text, edges_to_crawl, all_edges, stats):
	kept pages feed their edges to the crawl frontier; rejected pages
	still record their edges but contribute an empty frontier list.
	"""
	(url, params) = url_and_params
	(text, download_stats) = snowcrawl.downloadUrl( 'http://'+url )
	(kept, classify_stats) = classifyText( text, params )
	(edges, edge_stats) = snowcrawl.findEdges( url, text )
	stats = download_stats + classify_stats + edge_stats
	if kept:
		return (1, text, edges, edges, stats)
	else:
		return (0, text, [], edges, stats)
